In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
In [4]:
df = pd.read_csv(r"C:\Users\syed sahel\Downloads\Retail_sales.csv")
df.head()
Out[4]:
Store ID Product ID Date Units Sold Sales Revenue (USD) Discount Percentage Marketing Spend (USD) Store Location Product Category Day of the Week Holiday Effect
0 Spearsland 52372247 2022-01-01 9 2741.69 20 81 Tanzania Furniture Saturday False
1 Spearsland 52372247 2022-01-02 7 2665.53 0 0 Mauritania Furniture Sunday False
2 Spearsland 52372247 2022-01-03 1 380.79 0 0 Saint Pierre and Miquelon Furniture Monday False
3 Spearsland 52372247 2022-01-04 4 1523.16 0 0 Australia Furniture Tuesday False
4 Spearsland 52372247 2022-01-05 2 761.58 0 0 Swaziland Furniture Wednesday False

Data Cleaning & Preprocessing

In [7]:
# Show missing values
df.isnull().sum()

# Fill numeric missing values with median
num_cols = ["Units Sold", "Sales Revenue (USD)", "Discount Percentage", "Marketing Spend (USD)", "Holiday Effect"]
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical missing values with mode
cat_cols = ["Store ID", "Product ID", "Store Location", "Product Category", "Day of the Week"]
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Convert Date column
df["Date"] = pd.to_datetime(df["Date"])

# Extract date features
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["Day"] = df["Date"].dt.day
df["Week"] = df["Date"].dt.isocalendar().week
In [8]:
plt.figure(figsize=(12,5))
plt.plot(df["Date"], df["Sales Revenue (USD)"])
plt.title("Sales Revenue Over Time")
plt.xlabel("Date")
plt.ylabel("Revenue (USD)")
plt.show()

Units Sold by Product Category

In [9]:
category_sales = df.groupby("Product Category")["Units Sold"].sum()

plt.figure(figsize=(8,5))
sns.barplot(x=category_sales.index, y=category_sales.values)
plt.title("Units Sold by Category")
plt.xticks(rotation=45)
plt.show()

📊 Revenue by Store Location

In [35]:
location_revenue = df.groupby("Store Location")["Sales Revenue (USD)"].sum()

plt.figure(figsize=(15,5))
sns.barplot(x=location_revenue.index, y=location_revenue.values)
plt.title("Revenue by Store Location")
plt.xticks(rotation=45)
plt.show()

📊 Discount vs Revenue¶

In [15]:
plt.figure(figsize=(7,5))
sns.scatterplot(x=df["Discount Percentage"], y=df["Sales Revenue (USD)"])
plt.title("Effect of Discount on Sales Revenue")
plt.show()
In [20]:
plt.figure(figsize=(8,6))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()
C:\Users\syed sahel\AppData\Local\Temp\ipykernel_1164\2585287783.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
In [37]:
# Revenue per unit
df["Revenue_per_Unit"] = df["Sales Revenue (USD)"] / df["Units Sold"]

# Discount impact score
df["Discount_Impact"] = df["Units Sold"] * df["Discount Percentage"]

# Holiday impact multiplied by unit sales
df["Holiday_Sales_Boost"] = df["Holiday Effect"] * df["Units Sold"]
df
Out[37]:
Store ID Product ID Date Units Sold Sales Revenue (USD) Discount Percentage Marketing Spend (USD) Store Location Product Category Day of the Week Holiday Effect Year Month Day Week Revenue_per_Unit Discount_Impact Holiday_Sales_Boost
0 Spearsland 52372247 2022-01-01 9 2741.69 20 81 Tanzania Furniture Saturday False 2022 1 1 52 304.632222 180 0
1 Spearsland 52372247 2022-01-02 7 2665.53 0 0 Mauritania Furniture Sunday False 2022 1 2 52 380.790000 0 0
2 Spearsland 52372247 2022-01-03 1 380.79 0 0 Saint Pierre and Miquelon Furniture Monday False 2022 1 3 1 380.790000 0 0
3 Spearsland 52372247 2022-01-04 4 1523.16 0 0 Australia Furniture Tuesday False 2022 1 4 1 380.790000 0 0
4 Spearsland 52372247 2022-01-05 2 761.58 0 0 Swaziland Furniture Wednesday False 2022 1 5 1 380.790000 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
29995 Spearsland 50239115 2022-01-25 5 2501.15 0 100 Malawi Clothing Tuesday False 2022 1 25 4 500.230000 0 0
29996 Spearsland 50239115 2022-01-26 3 1500.69 0 0 Sudan Clothing Wednesday False 2022 1 26 4 500.230000 0 0
29997 Spearsland 50239115 2022-01-27 6 3001.38 0 0 South Georgia and the South Sandwich Islands Clothing Thursday False 2022 1 27 4 500.230000 0 0
29998 Spearsland 50239115 2022-01-28 5 2501.15 0 0 Haiti Clothing Friday False 2022 1 28 4 500.230000 0 0
29999 Spearsland 50239115 2022-01-29 3 1425.66 5 190 Mozambique Clothing Saturday False 2022 1 29 4 475.220000 15 0

30000 rows × 18 columns

Predict Future Revenue Feature Selection

In [22]:
feature_cols = [
    "Units Sold",
    "Discount Percentage",
    "Marketing Spend (USD)",
    "Holiday Effect",
    "Month",
    "Year"
]

X = df[feature_cols]
y = df["Sales Revenue (USD)"]
In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
In [24]:
model = LinearRegression()
model.fit(X_train, y_train)
Out[24]:
LinearRegression()
In [25]:
pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, pred))
print("MSE:", mean_squared_error(y_test, pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, pred)))
MAE: 1553.4563867802353
MSE: 4136687.2297183075
RMSE: 2033.8847631363749

📌 7️⃣ Predict Future Revenue

In [29]:
future = pd.DataFrame({
    "Units Sold": [120],
    "Discount Percentage": [15],
    "Marketing Spend (USD)": [500],
    "Holiday Effect": [1],
    "Month": [12],
    "Year": [2025]
})

future_prediction = model.predict(future)
future_prediction
Out[29]:
array([56176.93734149])

📌 8️⃣ Interactive Dashboard (Plotly)

In [27]:
fig = px.line(df, x="Date", y="Sales Revenue (USD)", title="Interactive Sales Revenue Trend")
fig.show()

fig2 = px.bar(df, x="Product Category", y="Units Sold", color="Product Category", title="Units Sold by Category")
fig2.show()

fig3 = px.scatter(df, x="Marketing Spend (USD)", y="Sales Revenue (USD)", title="Marketing Spend vs Revenue")
fig3.show()
In [ ]: